################# VMW firm to payroll provider firm DISTANCE FUNCTION ###############

### Read in the Data: 
#------------#CHOOSE sample size
sample_choice <- "10_percent_sample"

#CHOOSE balance requirement of the panel ("choice can be: unbalanced", "pre_post_1_t" or "pre_post_3_t")
suffix_mild_balanced = "pre_post_3_t"

output_tables_dir <- file.path(getwd(), "output/")

#-----------------------------------------------------------#

#Load dataset
df <- readRDS(paste0("data/derived/mild_balanced_", suffix_mild_balanced,"_stacked_nonpolicy_firm_payroll_provider_dataset_", sample_choice, ".rds"))

## Keep only Treatment = 23
df_trt_23 <- df %>%
  filter(trt_exp == 23)

## Locations of the Centroids to the Firms 
client_temp <- read.csv("data/payroll_provider/client_temp_centroids added.csv")

## Add the location values of the firms potentially affected by trt_exp == 23 to the df 
df_trt_23 <- df_trt_23 %>%
  left_join(client_temp, by = "clt_client")

############################### FINDING  DISTANCE ##################

# Keep firms with locations 
df_trt_23 <- df_trt_23 %>%
  filter(!is.na(long_centroid) & !is.na(lat_centroid))

## Just keep certain variables about the location OF FIRMS for storage space
zipcodes_trt_23 <- df_trt_23 %>% 
  select(clt_legalzip, long_centroid, lat_centroid, clt_division) 

## For conerns about large ram usage, this keeps the unique zipcodes of the firms. 
# If firm A and B are in the same zipcode, their distance to trt_23 facility is the same ... 
zipcodes_trt_23 <- zipcodes_trt_23 %>% 
  distinct(clt_legalzip, .keep_all = T)

### trt_23 FACILITIES
trt_23 <- read_rds("data/tmp_data/trt_23.rds")
stat_trt_23 <- trt_23  %>% 
  filter(date_opened < as.Date("2018-10-01") & permanent_staff >= 800 &
           (is.na(date_closed) | date_closed > as.Date("2019-04-01")))


#### Add the clt_division to the trt_23 dataframe 

state_division_table <- df_trt_23 %>%
  group_by(clt_division) %>%
  summarise(states = paste(unique(clt_legalstatefull), collapse = ", ")) %>%
  arrange(clt_division)

division_map <- list(
  "East North Central" = c("Illinois", "Indiana", "Ohio", "Michigan", "Wisconsin"),
  "East South Central" = c("Tennessee", "Kentucky", "Alabama", "Mississippi"), 
  "Middle Atlantic" = c("New York", "New Jersey", "Pennsylvania"), 
  "Mountain" = c("Arizona", "Colorado", "Wyoming", "Utah", "Nevada", "New Mexico", "Idaho", "Montana"),
  "New England" = c("Massachusetts", "Connecticut", "New Hampshire", "Rhode Island", "Maine", "Vermont"),
  "Pacific" = c("Washington", "California", "Oregon"), 
  "South Atlantic" = c("South Carolina", "Georgia", "Florida", "District Of Columbia", "Delaware", "Virginia", "Maryland", "North Carolina", "West Virginia"),
  "West North Central" = c("Missouri", "Minnesota", "Kansas", "Nebraska", "Iowa", "North Dakota", "South Dakota"), 
  "West South Central" = c("Oklahoma", "Texas", "Louisiana", "Arkansas")
)

## Function which matches a facility to a clt_division by the state the facility is located in 
find_division <- function(state) {
  division <- names(Filter(function(x) state %in% x, division_map))
  if (length(division) > 0) return(division) else return(NA)
}

stat_trt_23 <- stat_trt_23 %>%
  mutate(clt_division = sapply(state, find_division))

# Now, we have the clt_division of the trt_23 companies
stat_trt_23$clt_division <- unname(stat_trt_23$clt_division)


#### the distance function between two geocodes
distance_sf <- function(coords){
  lon1 <- as.numeric(coords[1])
  lat1 <- as.numeric(coords[2])
  lon2 <- as.numeric(coords[3])
  lat2 <- as.numeric(coords[4])
  point1 <- st_sfc(st_point(c(lon1, lat1)), crs = 4326)
  point2 <- st_sfc(st_point(c(lon2, lat2)), crs = 4326)
  distance <- st_distance(point1, point2, by_element = TRUE) 
  as.numeric(distance)
  
}

#### finding the minimum distance between zipcode centroids and facilities, which are within the same clt_division
## Make a table of the distances between a center and firm 
#stat_distance <- tibble()

# temporary datafile of zipcodes 
tmp_trt_23 <- zipcodes_trt_23 %>% 
  select(clt_legalzip, long_centroid, lat_centroid, clt_division)
tmp_trt_23$id <- 1
stat_trt_23$id <- 1
## Merge the zipcode of fimrms with the trt_23 location data, this is an m:m merge as the distance is calculated between a firm and all trt_23 centers within the same clt_divsion

## longitude = long_trt_23, latitude = lat_trt_23
tmp_trt_23 <- full_join(tmp_trt_23, stat_trt_23,  by = c("id", "clt_division"), 
                     select(id, serial, long_trt_23 = longitude,
                            lat_trt_23 = latitude))

## The output is every firm repeated for each trt_23 firm within the clt_division of the firm 
tmp_trt_23 <- tmp_trt_23 %>%
  mutate_at(c("longitude", "latitude", "long_centroid", "lat_centroid"),
            as.numeric)

## Make a list of coordinates to put into the distance function
list_coord_trt_23 <- as.list(tmp_trt_23 %>% select(long_centroid,lat_centroid,
                                             longitude, latitude) %>% as.matrix() %>%
                            t() %>% as.data.frame())
## Apply Distance Function, find the distance between each clt_division of the firm 
tmp_trt_23$distance <- sapply(list_coord_trt_23, distance_sf)


## Finding the min distance facility between a firm and trt_23 in a clt_division
tmp_min_trt_23 <- tmp_trt_23 %>% 
  group_by(clt_legalzip) %>% 
  slice_min(order_by = distance)

## Merge the total number of firms affected by the treatment == 23 with the distance to an trt_23 from their zipcode to an trt_23 in their clt
df_final_trt_23 <- df_trt_23 %>%
  left_join(tmp_min_trt_23, by = "clt_legalzip")


df_final_trt_23 <- df_final_trt_23 %>%
  mutate(
    distance_quantile = ntile(distance, 5)
  )


### NOTE: The distance variable is in meters 

## For ease in interpretation...

df_final_trt_23 <- df_final_trt_23 %>%
  mutate(
    distance_km = as.numeric(distance) / 1000)

quantiles_trt_23 <- quantile(df_final_trt_23$distance_km, probs = seq(0,1, by = 0.2))

quantiles_df_trt_23 <- data.frame(
  Quantile = names(quantiles_trt_23), 
  Km_Boundary = as.numeric(quantiles_trt_23)
)
write.csv(quantiles_df_trt_23, "data/tmp_data/quantiles_df_trt_23.csv")


write_csv(df_final, 
          paste0(project_dir, "data/tmp_data/", sample_choice, "_DIVISION_trt_23_distance_Q_clt.csv")
)



################################### TRT_19 #####################


################# Trt_19 REPLICATION WITH CHANGE IN DISTANCE FUNCTION ###############

#-----------------------------------------------------------#
#Load dataset
df <- readRDS(paste0("data/derived/mild_balanced_", suffix_mild_balanced,"_stacked_nonpolicy_firm_payroll_provider_dataset_", sample_choice, ".rds"))

## Keep only Treatment = 19
df_trt_19 <- df %>%
  filter(trt_exp == 19)

## Locations of the Centroids to the Firms 
client_temp <- read.csv("data/payroll_provider/client_temp_centroids added.csv")

## Add the location values of the firms to the df
df_trt_19 <- df_trt_19 %>%
  left_join(client_temp, by = "clt_client")


############################### FINDING  DISTANCE ##################

# Keep firms with locations 
df_trt_19 <- df_trt_19 %>%
  filter(!is.na(long_centroid) & !is.na(lat_centroid))

## ZIPCODES OF FIRMS 
zipcodes_trt_19 <- df_trt_19 %>% 
  select(clt_legalzip, long_centroid, lat_centroid, clt_division) 

## This keeps the unique zipcodes of the firms. 
# If firm A and B are in the same zipcode, their distance to trt_23 facility is the same ... 
zipcodes_trt_19 <- zipcodes_trt_19 %>% 
  distinct(clt_legalzip, .keep_all = T)

### VMW = 19 facilities
trt_19 <- read_rds("data/tmp_data/trt_19.rds")

stat_trt_19 <- trt_19  %>% 
  filter(date_opened < as.Date("2017-10-01") &
           (is.na(date_closed) | date_closed > as.Date("2018-04-01")))


#### Add the division to the trt_19 dataframe 

state_division_table <- df_trt_19 %>%
  group_by(clt_division) %>%
  summarise(states = paste(unique(clt_legalstatefull), collapse = ", ")) %>%
  arrange(clt_division)

division_map <- list(
  "East North Central" = c("Illinois", "Indiana", "Ohio", "Michigan", "Wisconsin"),
  "East South Central" = c("Tennessee", "Kentucky", "Alabama", "Mississippi"), 
  "Middle Atlantic" = c("New York", "New Jersey", "Pennsylvania"), 
  "Mountain" = c("Arizona", "Colorado", "Wyoming", "Utah", "Nevada", "New Mexico", "Idaho", "Montana"),
  "New England" = c("Massachusetts", "Connecticut", "New Hampshire", "Rhode Island", "Maine", "Vermont"),
  "Pacific" = c("Washington", "California", "Oregon"), 
  "South Atlantic" = c("South Carolina", "Georgia", "Florida", "District Of Columbia", "Delaware", "Virginia", "Maryland", "North Carolina", "West Virginia"),
  "West North Central" = c("Missouri", "Minnesota", "Kansas", "Nebraska", "Iowa", "North Dakota", "South Dakota"), 
  "West South Central" = c("Oklahoma", "Texas", "Louisiana", "Arkansas")
)

find_division <- function(state) {
  division <- names(Filter(function(x) state %in% x, division_map))
  if (length(division) > 0) return(division) else return(NA)
}

stat_trt_19 <- stat_trt_19 %>%
  mutate(clt_division = sapply(state, find_division))

stat_trt_19$clt_division <- unname(stat_trt_19$clt_division)


#### the distance function between two geocodes
distance_sf <- function(coords){
  lon1 <- as.numeric(coords[1])
  lat1 <- as.numeric(coords[2])
  lon2 <- as.numeric(coords[3])
  lat2 <- as.numeric(coords[4])
  point1 <- st_sfc(st_point(c(lon1, lat1)), crs = 4326)
  point2 <- st_sfc(st_point(c(lon2, lat2)), crs = 4326)
  distance <- st_distance(point1, point2, by_element = TRUE) 
  as.numeric(distance)
  
}

#### finding the minimum distance between zipcode centroids and trt_19 facilities

# temporary datafile of zipcodes 
tmp_trt_19 <- zipcodes_trt_19 %>% 
  select(clt_legalzip, long_centroid, lat_centroid, clt_division)
tmp_trt_19$id <- 1
stat_trt_19$id <- 1
## Merge the zipcode of firms with the trt_19 location data, this is an m:m merge as the distance is calculated between a firm and all trt_19 centers

## longitude = long_trt_19, latitude = lat_trt_19
tmp_trt_19 <- full_join(tmp_trt_19, stat_trt_19,  by = c("id", "clt_division"), 
                        select(id, serial, long_trt_19 = longitude,
                               lat_trt_19 = latitude))
## longitude = long_trt_19, latitude = lat_trt_19
tmp_trt_19 <- tmp_trt_19 %>%
  mutate_at(c("longitude", "latitude", "long_centroid", "lat_centroid"),
            as.numeric)

list_coord_trt_19 <- as.list(tmp_trt_19 %>% select(long_centroid,lat_centroid,
                                                   longitude, latitude) %>% as.matrix() %>%
                               t() %>% as.data.frame())
## Apply Distance Function
tmp_trt_19$distance <- sapply(list_coord_trt_19, distance_sf)

nan_trt_19 <- tmp_trt_19 %>%
  filter(is.na(distance))

write_csv(tmp_trt_19, 
          paste0(project_dir, "data/tmp data/", sample_choice, "_temp_trt_19_distance_Q_clt.csv")
)


## Finding the min distance facility
tmp_min_trt_19 <- tmp_trt_19 %>% 
  group_by(clt_legalzip) %>% 
  slice_min(order_by = distance)

df_final_trt_19 <- df_trt_19 %>%
  left_join(tmp_min_trt_19, by = "clt_legalzip")


## Now to make the quintiles

df_final_trt_19 <- df_final_trt_19 %>%
  mutate(
    distance_quantile = ntile(distance, 5)
  )


### NOTE: The distance variable is in meters 

df_final_trt_19 <- df_final_trt_19 %>%
  mutate(
    distance_km = as.numeric(distance) / 1000)

quantiles_trt_19 <- quantile(df_final_trt_19$distance_km, probs = seq(0,1, by = 0.2))
quantiles_df_trt_19 <- data.frame(
  Quantile = names(quantiles_trt_19), 
  Km_Boundary = as.numeric(quantiles_trt_19)
)
write.csv(quantiles_df_trt_19, "data/tmp_data/quantiles_df_trt_19.csv")



write_csv(df_final_trt_19, 
          paste0(project_dir, "data/tmp data/", sample_choice, "_DIVISION_trt_19_distance_Q_clt.csv")
)

################################# TRT_3 ####################
################# TRT_3 REPLICATION WITH CHANGE IN DISTANCE FUNCTION ###############

#-----------------------------------------------------------#
#Load dataset
 df <- readRDS(paste0("data/derived/mild_balanced_", suffix_mild_balanced,"_stacked_nonpolicy_firm_payroll_provider_dataset_", sample_choice, ".rds"))

## Keep only Treatment = 3
df_trt_3 <- df %>%
  filter(trt_exp == 3)

## Locations of the Centroids to the Firms 
client_temp <- read.csv("data/payroll_provider/client_temp_centroids added.csv")

## Add the location values of the firms to the df
df_trt_3 <- df_trt_3 %>%
  left_join(client_temp, by = "clt_client")


############################### FINDING  DISTANCE ##################

# Keep firms with locations 
df_trt_3 <- df_trt_3 %>%
  filter(!is.na(long_centroid) & !is.na(lat_centroid))

## ZIPCODES OF FIRMS 
zipcodes_trt_3 <- df_trt_3 %>% 
  select(clt_legalzip, long_centroid, lat_centroid, clt_division) 

## This keeps the unique zipcodes of the firms. 
# If firm A and B are in the same zipcode, their distance to trt_3 facility is the same ... 
zipcodes_trt_3 <- zipcodes_trt_3 %>% 
  distinct(clt_legalzip, .keep_all = T)

### trt_3 FACILITIES
trt_3 <- read_rds("data/tmp_data/trt_3.rds")

stat_trt_3 <- trt_3 %>% 
  filter(date_opened < as.Date("2018-03-01") &
           (is.na(date_closed) | date_closed > as.Date("2018-09-01")))


#### Add the division to the trt_3 dataframe 

state_division_table <- df_trt_3 %>%
  group_by(clt_division) %>%
  summarise(states = paste(unique(clt_legalstatefull), collapse = ", ")) %>%
  arrange(clt_division)

division_map <- list(
  "East North Central" = c("Illinois", "Indiana", "Ohio", "Michigan", "Wisconsin"),
  "East South Central" = c("Tennessee", "Kentucky", "Alabama", "Mississippi"), 
  "Middle Atlantic" = c("New York", "New Jersey", "Pennsylvania"), 
  "Mountain" = c("Arizona", "Colorado", "Wyoming", "Utah", "Nevada", "New Mexico", "Idaho", "Montana"),
  "New England" = c("Massachusetts", "Connecticut", "New Hampshire", "Rhode Island", "Maine", "Vermont"),
  "Pacific" = c("Washington", "California", "Oregon"), 
  "South Atlantic" = c("South Carolina", "Georgia", "Florida", "District Of Columbia", "Delaware", "Virginia", "Maryland", "North Carolina", "West Virginia"),
  "West North Central" = c("Missouri", "Minnesota", "Kansas", "Nebraska", "Iowa", "North Dakota", "South Dakota"), 
  "West South Central" = c("Oklahoma", "Texas", "Louisiana", "Arkansas")
)

find_division <- function(state) {
  division <- names(Filter(function(x) state %in% x, division_map))
  if (length(division) > 0) return(division) else return(NA)
}

stat_trt_3 <- stat_trt_3 %>%
  mutate(clt_division = sapply(state, find_division))

stat_trt_3$clt_division <- unname(stat_trt_3$clt_division)


#### the distance function between two geocodes
distance_sf <- function(coords){
  lon1 <- as.numeric(coords[1])
  lat1 <- as.numeric(coords[2])
  lon2 <- as.numeric(coords[3])
  lat2 <- as.numeric(coords[4])
  point1 <- st_sfc(st_point(c(lon1, lat1)), crs = 4326)
  point2 <- st_sfc(st_point(c(lon2, lat2)), crs = 4326)
  distance <- st_distance(point1, point2, by_element = TRUE) 
  as.numeric(distance)
  
}

#### finding the minimum distance between zipcode centroids and trt_3 facilities

# temporary datafile of zipcodes 
tmp_trt_3 <- zipcodes_trt_3 %>% 
  select(clt_legalzip, long_centroid, lat_centroid, clt_division)
tmp_trt_3$id <- 1
stat_trt_3$id <- 1
## Merge the zipcode of fimrms with the trt_3 location data, this is an m:m merge as the distance is calculated between a firm and all trt_3 centers

## longitude = long_trt_3, latitude = lat_trt_3
tmp_trt_3 <- full_join(tmp_trt_3, stat_trt_3,  by = c("id", "clt_division"), 
                         select(id, serial, long_trt_3 = longitude,
                                lat_trt_3 = latitude))
## longitude = long_trt_3, latitude = lat_trt_3
tmp_trt_3 <- tmp_trt_3 %>%
  mutate_at(c("longitude", "latitude", "long_centroid", "lat_centroid"),
            as.numeric)


## longitude and latitude are of trt_3
## Make a list of coordinates to put into the distance function
list_coord_trt_3 <- as.list(tmp_trt_3 %>% select(long_centroid,lat_centroid,
                                                     longitude, latitude) %>% as.matrix() %>%
                                t() %>% as.data.frame())
## Apply Distance Function
tmp_trt_3$distance <- sapply(list_coord_trt_3, distance_sf)


## Finding the min distance facility
tmp_min_trt_3 <- tmp_trt_3 %>% 
  group_by(clt_legalzip) %>% 
  slice_min(order_by = distance)

df_final_trt_3 <- df_trt_3 %>%
  left_join(tmp_min_trt_3, by = "clt_legalzip")


## Now to make the quintiles

df_final_trt_3 <- df_final_trt_3 %>%
  mutate(
    distance_quantile = ntile(distance, 5)
  )


### NOTE: The distance variable is in meters 

df_final_trt_3 <- df_final_trt_3 %>%
  mutate(
    distance_km = as.numeric(distance) / 1000)

quantiles_trt_3 <- quantile(df_final_trt_3$distance_km, probs = seq(0,1, by = 0.2))

quantiles_df_trt_3 <- data.frame(
  Quantile = names(quantiles_trt_3), 
  Km_Boundary = as.numeric(quantiles_trt_3)
)
write.csv(quantiles_df_trt_3, "data/tmp_data/quantiles_df_trt_3.csv")


write_csv(df_final_trt_3, 
          paste0(project_dir, "data/tmp data/", sample_choice, "_DIVISION_trt_3_distance_Q_clt.csv")
)